datanode节点大量长驻FIN_WAIT1 socket

有个hadoop集群的datanode节点,datanode rpc端口50010, netstat -antp|grep 50010会发现大量FIN_WAIT1状态的socket,且长时间不消失,看了socket的对端机器,发现同一个连接,显示的状态一直是established. 根据网上的资料,做了一个实验,重现了这个现象

datanode机器:

1
2
#fin_wait, socket发送缓冲区大量堵包
tcp 0 517122 10.42.67.209:50010 10.42.51.120:41218 FIN_WAIT1 -

对端机器:

1
2
#established, socket接收端缓冲区大量堵包
tcp 527752 0 10.42.51.120:41218 10.42.67.209:50010 ESTABLISHED 4765/java

server.cpp:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <malloc.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/ioctl.h>
#include <stdarg.h>
#include <fcntl.h>

/*
 * Demo server used to reproduce a stalled TCP receive buffer.
 *
 * Listens on TCP port 8765 on all interfaces, accepts exactly one
 * connection, and then performs one recv() per line of keyboard input.
 * As long as the operator does not press Enter, nothing is read from the
 * connection, so data sent by the peer piles up in this socket's receive
 * buffer. Further incoming connections are never accepted.
 *
 * Returns 0 after a clean shutdown (stdin EOF or peer closed the
 * connection) and EXIT_FAILURE if any socket call fails.
 */
int main()
{
    int sockSrv = socket(AF_INET, SOCK_STREAM, 0);
    if (sockSrv < 0) {
        perror("socket");
        return EXIT_FAILURE;
    }

    /* Zero the whole structure so the sin_zero padding is not stack garbage. */
    struct sockaddr_in addrSrv;
    memset(&addrSrv, 0, sizeof(addrSrv));
    addrSrv.sin_family = AF_INET;
    addrSrv.sin_addr.s_addr = htonl(INADDR_ANY);
    addrSrv.sin_port = htons(8765);

    if (bind(sockSrv, (const struct sockaddr *)&addrSrv, sizeof(addrSrv)) < 0) {
        perror("bind");
        close(sockSrv);
        return EXIT_FAILURE;
    }

    if (listen(sockSrv, 2) < 0) {
        perror("listen");
        close(sockSrv);
        return EXIT_FAILURE;
    }

    struct sockaddr_in addrClient;
    socklen_t len = sizeof(addrClient); /* accept() takes a socklen_t *, not an int * */

    int sockConn = accept(sockSrv, (struct sockaddr *)&addrClient, &len);
    if (sockConn < 0) {
        perror("accept");
        close(sockSrv);
        return EXIT_FAILURE;
    }

    for (;;) {
        /*
         * Block here until the operator presses Enter. While blocked, the
         * server does not consume the socket receive buffer, which is what
         * lets the buffer fill up. Stop on EOF instead of spinning.
         */
        if (getchar() == EOF) {
            break;
        }

        char szRecvBuf[50001] = {0};
        int iRet = (int)recv(sockConn, szRecvBuf, sizeof(szRecvBuf) - 1, 0);
        printf("iRet is %d\n", iRet);

        if (iRet < 0) {
            perror("recv");
            break;
        }
        if (iRet == 0) {
            /* Peer closed its side; nothing more will ever arrive. */
            break;
        }
    }

    close(sockConn);
    close(sockSrv);

    return 0;
}

// g++ server.cpp -o server

client.cpp:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#include <unistd.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <malloc.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/ioctl.h>
#include <stdarg.h>
#include <fcntl.h>

/*
 * Demo client used to fill the peer's receive buffer.
 *
 * Connects to 10.9.174.174:8765 and sends one 2000-byte chunk of 'a'
 * characters immediately, then one more chunk each time the operator
 * presses Enter. After every send it prints the send() return value and
 * the running total of bytes handed to the kernel.
 *
 * Returns 0 after a clean shutdown (stdin EOF or a failed send) and
 * EXIT_FAILURE if the socket cannot be created or connected.
 */
int main()
{
    enum { SEND_CHUNK_BYTES = 2000 };

    int sockClient = socket(AF_INET, SOCK_STREAM, 0);
    if (sockClient < 0) {
        perror("socket");
        return EXIT_FAILURE;
    }

    /* Zero the whole structure so the sin_zero padding is not stack garbage. */
    struct sockaddr_in addrSrv;
    memset(&addrSrv, 0, sizeof(addrSrv));
    addrSrv.sin_family = AF_INET;
    addrSrv.sin_addr.s_addr = inet_addr("10.9.174.174");
    addrSrv.sin_port = htons(8765);

    if (connect(sockClient, (const struct sockaddr *)&addrSrv, sizeof(addrSrv)) < 0) {
        perror("connect");
        close(sockClient);
        return EXIT_FAILURE;
    }

    /* Raw payload; the buffer is sent by length, so no trailing '\0' is needed. */
    char szSendBuf[SEND_CHUNK_BYTES];
    memset(szSendBuf, 'a', sizeof(szSendBuf));

    int total = 0;
    for (;;) {
        int iRet = (int)send(sockClient, szSendBuf, sizeof(szSendBuf), 0);
        if (iRet < 0) {
            /* Do not fold the -1 error value into the byte counter. */
            perror("send");
            break;
        }
        total += iRet;
        printf("iRet is %d, total send is %d\n", iRet, total);

        /*
         * Block here after each send; every Enter keypress sends one more
         * chunk to the server. Stop on EOF instead of spinning.
         */
        if (getchar() == EOF) {
            break;
        }
    }

    close(sockClient);

    return 0;
}


// g++ client.cpp -o client

这里如果多个client发起请求,用 netstat -antp|grep 8765 查看,会看到listen socket的Recv-Q数值不断增大(对于LISTEN状态的socket,Recv-Q表示已完成三次握手、但尚未被accept取走的连接数,即accept队列的当前长度),因为新的客户端连接并没有被accept处理。server端这些新连接的socket接收缓冲区也会不断增大(tcpdump抓包看到新连接的三次握手已经完成,server端socket接收缓冲区已经在堆积;在缓冲区未满时,还是能接收客户端发过来的数据,但是无法消费)。这个时候,即使client退出,server端listen socket的accept队列也不会减少。

1
2
3
4
5
tcp        3      0 0.0.0.0:8765            0.0.0.0:*               LISTEN      20479/./server
tcp 2001 0 10.9.174.174:8765 10.9.153.134:34150 CLOSE_WAIT -
tcp 0 0 10.9.174.174:8765 10.9.153.134:32894 CLOSE_WAIT 20479/./server
tcp 4001 0 10.9.174.174:8765 10.9.153.134:32980 CLOSE_WAIT -
tcp 2001 0 10.9.174.174:8765 10.9.153.134:33570 CLOSE_WAIT -

单个客户端不断回车发送消息给服务端,服务端socket如果接收缓冲区未满,会接收数据并回复确认包。这个过程中,server端socket接收缓冲区不断堆积,通告给客户端的滑动窗口先会上升,然后慢慢下降,因为缓冲区可用空间在不断减少。同时,server端socket接收窗口变为0后,client继续发送的包无法送到server端,会看到client端socket发送缓冲区开始堵包。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# tcpdump -i eth0 port 8765

17:51:31.471769 IP uhadoop-0b232ove-master2.37154 > uhadoop-0b232ove-master1.ultraseek-http: Flags [P.], seq 65403:66001, ack 1, win 221, options [nop,nop,TS val 1561362974 ecr 1561340287], length 598
17:51:31.511212 IP uhadoop-0b232ove-master1.ultraseek-http > uhadoop-0b232ove-master2.37154: Flags [.], ack 66001, win 587, options [nop,nop,TS val 1561363820 ecr 1561362974], length 0
17:51:31.650382 IP uhadoop-0b232ove-master2.37154 > uhadoop-0b232ove-master1.ultraseek-http: Flags [.], seq 66001:67403, ack 1, win 221, options [nop,nop,TS val 1561363153 ecr 1561363820], length 1402
17:51:31.650387 IP uhadoop-0b232ove-master2.37154 > uhadoop-0b232ove-master1.ultraseek-http: Flags [P.], seq 67403:68001, ack 1, win 221, options [nop,nop,TS val 1561363153 ecr 1561363820], length 598
17:51:31.690253 IP uhadoop-0b232ove-master1.ultraseek-http > uhadoop-0b232ove-master2.37154: Flags [.], ack 68001, win 572, options [nop,nop,TS val 1561363999 ecr 1561363153], length 0
17:51:31.989746 IP uhadoop-0b232ove-master2.37154 > uhadoop-0b232ove-master1.ultraseek-http: Flags [.], seq 68001:69403, ack 1, win 221, options [nop,nop,TS val 1561363492 ecr 1561363999], length 1402
17:51:31.989764 IP uhadoop-0b232ove-master2.37154 > uhadoop-0b232ove-master1.ultraseek-http: Flags [P.], seq 69403:70001, ack 1, win 221, options [nop,nop,TS val 1561363492 ecr 1561363999], length 598
17:51:32.029119 IP uhadoop-0b232ove-master1.ultraseek-http > uhadoop-0b232ove-master2.37154: Flags [.], ack 70001, win 557, options [nop,nop,TS val 1561364338 ecr 1561363492], length 0
17:51:32.469041 IP uhadoop-0b232ove-master2.37154 > uhadoop-0b232ove-master1.ultraseek-http: Flags [.], seq 70001:71403, ack 1, win 221, options [nop,nop,TS val 1561363972 ecr 1561364338], length 1402
17:51:32.469060 IP uhadoop-0b232ove-master2.37154 > uhadoop-0b232ove-master1.ultraseek-http: Flags [P.], seq 71403:72001, ack 1, win 221, options [nop,nop,TS val 1561363972 ecr 1561364338], length 598
17:51:32.509158 IP uhadoop-0b232ove-master1.ultraseek-http > uhadoop-0b232ove-master2.37154: Flags [.], ack 72001, win 542, options [nop,nop,TS val 1561364818 ecr 1561363972], length 0
17:51:32.820495 IP uhadoop-0b232ove-master2.37154 > uhadoop-0b232ove-master1.ultraseek-http: Flags [.], seq 72001:73403, ack 1, win 221, options [nop,nop,TS val 1561364323 ecr 1561364818], length 1402
17:51:32.820504 IP uhadoop-0b232ove-master2.37154 > uhadoop-0b232ove-master1.ultraseek-http: Flags [P.], seq 73403:74001, ack 1, win 221, options [nop,nop,TS val 1561364323 ecr 1561364818], length 598
17:51:32.860173 IP uhadoop-0b232ove-master1.ultraseek-http > uhadoop-0b232ove-master2.37154: Flags [.], ack 74001, win 527, options [nop,nop,TS val 1561365169 ecr 1561364323], length 0
...
18:35:01.617225 IP uhadoop-0b232ove-master1.ultraseek-http > uhadoop-0b232ove-master2.37154: Flags [.], ack 332769, win 0, options [nop,nop,TS val 1563973926 ecr 1563491546], length 0

# 最后窗口变为0